This is a quick r script I wrote to merge the phenotype files and then analyze.
library(ggplot2)
library(tidyverse)
library(asreml)
library(psych)
library(knitr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:emmeans':
##
## pigs
setwd("C:/Users/zwinn/OneDrive/University of Arkansas/Post Graduation Work")
hgawn<-read.csv("HGAWN_Aligned_Data.csv")
hgawn<-hgawn %>%
filter(!Variety=="6-Row-Barley") %>%
select(Tray, Rep, Variety,
Days_to_heading, Date_of_anther_sampling,
Area_of_anthers_per_spike, Number_of_anthers_per_spike,
Area_per_anther)
colnames(hgawn)[4:5]<-c("Heading_Date", "Anthesis_Date")
hgawn[,1:3]<-lapply(hgawn[,1:3], as.factor)
hgawn[,4:ncol(hgawn)]<-lapply(hgawn[,4:ncol(hgawn)], as.numeric)
lapply(hgawn[,1:ncol(hgawn)], class)
## $Tray
## [1] "factor"
##
## $Rep
## [1] "factor"
##
## $Variety
## [1] "factor"
##
## $Heading_Date
## [1] "numeric"
##
## $Anthesis_Date
## [1] "numeric"
##
## $Area_of_anthers_per_spike
## [1] "numeric"
##
## $Number_of_anthers_per_spike
## [1] "numeric"
##
## $Area_per_anther
## [1] "numeric"
head(hgawn)
## Tray Rep Variety Heading_Date Anthesis_Date Area_of_anthers_per_spike
## 1 1 1 NC05-21642 116 119 15.04
## 2 1 1 LA01110D-251 116 121 3.59
## 3 1 1 AR06012-6-3 116 120 13.72
## 4 1 1 TX13D5217 122 NA NA
## 5 1 1 VA12W-102 117 120 8.09
## 6 1 1 AR04119-3 114 NA NA
## Number_of_anthers_per_spike Area_per_anther
## 1 33.25 0.4523308
## 2 7.25 0.4951724
## 3 29.00 0.4731034
## 4 NA NA
## 5 16.00 0.5056250
## 6 NA NA
Here I loop through the to perform the following mixed linear model in ASReml:
\(y=V+R+\varepsilon\)
Where y is the response, V is the variety which is fixed, R is the rep which is the blocking factor and random, an \(\varepsilon\) is the residual error.
#set up traits
traits<-colnames(hgawn[4:ncol(hgawn)])
#create dataframe to bind onto
preds<-data.frame(Variety=unique(hgawn$Variety))
#create dataframe for AIC, BIC, and LogLik
crit<-data.frame(row.names = c("AIC", "BIC", "LogLik"))
#look at distribution
for (i in traits){
hist(hgawn[,i], main = paste("Distribution of", i), xlab = i)
boxplot(hgawn[,i], main=i)
}
#loop MLMs for each trait and pull BLUEs
for (i in traits){
#announce step
print(paste("------- Analyzing trait", i, "-------"))
#run model
mlm<-asreml(fixed=hgawn[,i]~1+Variety,
random=~Rep,
units=~idv(units),
data=hgawn)
#display anova table
print(wald.asreml(mlm))
#look at residuals
print(plot(resid(mlm), main=paste("Residuals of", i, "Analysis")))
#pull out fit criteria
c<-as.data.frame(rbind(summary.asreml(mlm)$aic, summary.asreml(mlm)$bic, summary.asreml(mlm)$loglik))
colnames(c)<-i
crit<-cbind(crit,c)
#Pull out fixed effects
p<-predict(mlm, classify = "Variety")$pvals[,1:2]
colnames(p)[2]<-i
preds<-left_join(preds, p)
#remove
remove(mlm,c,p)
}
## [1] "------- Analyzing trait Heading_Date -------"
## License check Thu Mar 11 14:58:41 2021
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:42 2021
## LogLik Sigma2 DF wall cpu
## 1 -583.966 1.47951 572 14:58:42 0.1
## 2 -583.796 1.47814 572 14:58:42 0.0
## 3 -583.625 1.47650 572 14:58:42 0.0
## 4 -583.519 1.47496 572 14:58:42 0.0
## 5 -583.503 1.47437 572 14:58:42 0.0
## 6 -583.503 1.47425 572 14:58:42 0.0
## [0;34mWald tests for fixed effects.[0m
## [0;34mResponse: hgawn[, i][0m
## [0;34mTerms added sequentially; adjusted for those above.[0m
##
## Df Sum of Sq Wald statistic Pr(Chisq)
## (Intercept) 1 89918 60992 < 2.2e-16 ***
## Variety 580 1761 1195 < 2.2e-16 ***
## residual (MS) 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:43 2021
## LogLik Sigma2 DF wall cpu
## 1 -583.503 1.47425 572 14:58:43 0.1
## 2 -583.503 1.47425 572 14:58:43 0.0
## 3 -583.503 1.47425 572 14:58:43 0.3
## Joining, by = "Variety"
## [1] "------- Analyzing trait Anthesis_Date -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:43 2021
## LogLik Sigma2 DF wall cpu
## 1 -309.475 1.19216 341 14:58:43 0.1
## 2 -309.453 1.19258 341 14:58:43 0.0
## 3 -309.442 1.19354 341 14:58:43 0.0
## 4 -309.441 1.19331 341 14:58:43 0.0
## [0;34mWald tests for fixed effects.[0m
## [0;34mResponse: hgawn[, i][0m
## [0;34mTerms added sequentially; adjusted for those above.[0m
##
## Df Sum of Sq Wald statistic Pr(Chisq)
## (Intercept) 1 424528 355756 < 2.2e-16 ***
## Variety 492 928 778 3.109e-15 ***
## residual (MS) 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:43 2021
## LogLik Sigma2 DF wall cpu
## 1 -309.441 1.19330 341 14:58:44 0.1
## 2 -309.441 1.19330 341 14:58:44 0.0
## 3 -309.441 1.19330 341 14:58:44 0.2
## Joining, by = "Variety"
## [1] "------- Analyzing trait Area_of_anthers_per_spike -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:44 2021
## LogLik Sigma2 DF wall cpu
## 1 -818.892 23.9949 340 14:58:44 0.1 (1 restrained)
## 2 -818.455 24.1021 340 14:58:44 0.0
## 3 -818.385 24.0683 340 14:58:44 0.0
## 4 -818.378 24.0567 340 14:58:44 0.0
## Warning in asreml(fixed = hgawn[, i] ~ 1 + Variety, random = ~Rep, units =
## ~idv(units), : Some components changed by more than 1% on the last iteration.
## [0;34mWald tests for fixed effects.[0m
## [0;34mResponse: hgawn[, i][0m
## [0;34mTerms added sequentially; adjusted for those above.[0m
##
## Df Sum of Sq Wald statistic Pr(Chisq)
## (Intercept) 1 9913.3 412.08 < 2.2e-16 ***
## Variety 494 15322.2 636.92 1.389e-05 ***
## residual (MS) 24.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:44 2021
## LogLik Sigma2 DF wall cpu
## 1 -818.377 24.0549 340 14:58:44 0.1
## 2 -818.377 24.0549 340 14:58:44 0.0
## 3 -818.377 24.0548 340 14:58:45 0.2
## Joining, by = "Variety"
## [1] "------- Analyzing trait Number_of_anthers_per_spike -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:45 2021
## LogLik Sigma2 DF wall cpu
## 1 -1053.386 95.3181 340 14:58:45 0.1 (1 restrained)
## 2 -1052.790 95.6540 340 14:58:45 0.0
## 3 -1052.770 95.5823 340 14:58:45 0.0
## 4 -1052.769 95.5661 340 14:58:45 0.0
## [0;34mWald tests for fixed effects.[0m
## [0;34mResponse: hgawn[, i][0m
## [0;34mTerms added sequentially; adjusted for those above.[0m
##
## Df Sum of Sq Wald statistic Pr(Chisq)
## (Intercept) 1 63488 664.33 < 2.2e-16 ***
## Variety 492 58067 607.61 0.0002784 ***
## residual (MS) 96
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:45 2021
## LogLik Sigma2 DF wall cpu
## 1 -1052.769 95.5652 340 14:58:45 0.1
## 2 -1052.769 95.5652 340 14:58:45 0.0
## 3 -1052.769 95.5652 340 14:58:45 0.2
## Joining, by = "Variety"
## [1] "------- Analyzing trait Area_per_anther -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:45 2021
## LogLik Sigma2 DF wall cpu
## 1 598.832 0.00573189 340 14:58:45 0.1 (1 restrained)
## 2 599.481 0.00575030 340 14:58:45 0.0
## 3 599.490 0.00574741 340 14:58:45 0.0
## 4 599.490 0.00574694 340 14:58:45 0.0
## [0;34mWald tests for fixed effects.[0m
## [0;34mResponse: hgawn[, i][0m
## [0;34mTerms added sequentially; adjusted for those above.[0m
##
## Df Sum of Sq Wald statistic Pr(Chisq)
## (Intercept) 1 36.386 6331.4 < 2.2e-16 ***
## Variety 491 7.467 1299.3 < 2.2e-16 ***
## residual (MS) 0.006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:46 2021
## LogLik Sigma2 DF wall cpu
## 1 599.490 0.00574693 340 14:58:46 0.1
## 2 599.490 0.00574693 340 14:58:46 0.0
## 3 599.490 0.00574693 340 14:58:46 0.2
## Joining, by = "Variety"
#look at the fit criteria
kable(crit, caption="Fit Criteria of RCBD for Each Trait")
| Heading_Date | Anthesis_Date | Area_of_anthers_per_spike | Number_of_anthers_per_spike | Area_per_anther | |
|---|---|---|---|---|---|
| AIC | 1171.0050 | 622.8826 | 1640.7552 | 2109.539 | -1194.9796 |
| BIC | 1179.7033 | 630.5464 | 1648.4131 | 2117.197 | -1187.3217 |
| LogLik | -583.5025 | -309.4413 | -818.3776 | -1052.769 | 599.4898 |
#write out predictions
write.csv(preds,"BLUEs_2018_2019_HGAWN.csv", row.names = F)
#remove outlier for correlation study
preds<-preds[-as.numeric(rownames(preds)[which.max(preds$Area_of_anthers_per_spike)]),]
#make a pretty figure with no underscore
fig<-preds
colnames(fig)<-c("Variety","HD","AD","AOAPS","NOAPS","APA")
#display the raw values of the correlation
kable(cor(fig[,2:ncol(fig)], use="complete.obs"), caption="Correlations Among BLUEs")
| HD | AD | AOAPS | NOAPS | APA | |
|---|---|---|---|---|---|
| HD | 1.0000000 | 0.7312329 | -0.0820714 | -0.1639121 | 0.1454097 |
| AD | 0.7312329 | 1.0000000 | -0.2889538 | -0.3971774 | 0.1214169 |
| AOAPS | -0.0820714 | -0.2889538 | 1.0000000 | 0.8968776 | 0.4666715 |
| NOAPS | -0.1639121 | -0.3971774 | 0.8968776 | 1.0000000 | 0.0659067 |
| APA | 0.1454097 | 0.1214169 | 0.4666715 | 0.0659067 | 1.0000000 |
#figure for publication
a<-ggpairs(fig[,2:ncol(fig)],
lower = list(continous=wrap("smooth", color="blue")),
diag = list(continous="barDiag"),
upper = list(continous="cor"))
a+theme(legend.position = "none",
panel.grid.major = element_blank(),
axis.ticks.x = element_blank(),
panel.border = element_rect(linetype = "dashed", colour = "black", fill = NA))
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 104 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 102 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 104 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 104 rows containing missing values (geom_point).
## Warning: Removed 104 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 104 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 102 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 102 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 104 rows containing missing values (geom_point).
## Warning: Removed 104 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 104 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing non-finite values (stat_density).
#create a correlation matrix
S<-cor(fig[,2:ncol(fig)], use = "complete.obs")
#perform eigen value decomposition on the correlation matrix
V<-eigen(S)$vectors
#use prcomp to perform PCA
pca_S<-prcomp(V)
#summary of PCA
summary(pca_S)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 0.50 0.50 0.50 0.50 1.277e-17
## Proportion of Variance 0.25 0.25 0.25 0.25 0.000e+00
## Cumulative Proportion 0.25 0.50 0.75 1.00 1.000e+00
#pull loadings
load<-pca_S$rotation
#rename the rownames
rownames(load)<-rownames(S)
#change to dataframe
load<-as.data.frame(load)
#change point size
update_geom_defaults("point",list(size=4))
#2D plot loadings
ggplot(data = load, aes(x=PC1, y=PC2, color=rownames(load)))+
geom_point()+
theme(legend.title=element_blank())
#library
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#3D plot of loadings
a<-plot_ly(data = load, x=~PC1, y=~PC2, z=~PC3, color = ~rownames(load)) %>%
add_markers()
a